The goal of this script is to generate a Seurat object for sample GSM3717037.

  • removal of cells based on quality control metrics
  • normalization with LogNormalize, then doublets detection using scran hybrid and scDblFinder method (but not filtering)
  • normalization with LogNormalize, for only the remaining cells
  • cell cycle and cell type annotation
  • dimensionality reduction using PCA
  • projection using tSNE and UMAP
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
## [1] "/usr/local/lib/R/library"

Preparation

In this section, we set the global settings of the analysis. We will store data there :

out_dir = "."

We load the parameters :

sample_name = params$sample_name # "GSM3717034"
# sample_name = "GSM3717034"

Input count matrix is there :

count_matrix_dir = paste0(out_dir, "/input/", sample_name, "/")
count_matrix_file = list.files(count_matrix_dir, full.names = TRUE)
count_matrix_file
## [1] "./input/GSM3717037//GSM3717037_10X1data.HFU13.tsv.gz"

We load the markers and specific colors for each cell type :

cell_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               13               13                9               10 
##          B cells          cuticle           cortex          medulla 
##               16               15               16               10 
##              IRS    proliferative              IBL              ORS 
##               16               20               15               16 
##              IFE             HFSC      melanocytes        sebocytes 
##               17               17               10                8

Here are custom colors for each cell type :

color_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())

We load markers to display on the dotplot :

dotplot_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
## $`CD4 T cells`
## [1] "CD3E" "CD4" 
## 
## $`CD8 T cells`
## [1] "CD3E" "CD8A"
## 
## $`Langerhans cells`
## [1] "CD207" "CPVL" 
## 
## $macrophages
## [1] "TREM2" "MSR1" 
## 
## $`B cells`
## [1] "CD79A" "CD79B"
## 
## $cuticle
## [1] "KRT32" "KRT35"
## 
## $cortex
## [1] "KRT31" "PRR9" 
## 
## $medulla
## [1] "BAMBI"   "ADLH1A3"
## 
## $IRS
## [1] "KRT71" "KRT73"
## 
## $proliferative
## [1] "TOP2A" "MCM5" 
## 
## $IBL
## [1] "KRT16" "KRT6C"
## 
## $ORS
## [1] "KRT15" "GPX2" 
## 
## $IFE
## [1] "SPINK5" "LY6D"  
## 
## $HFSC
## [1] "DIO2"   "TCEAL2"
## 
## $melanocytes
## [1] "DCT"   "MLANA"
## 
## $sebocytes
## [1] "CLMP"  "PPARG"

We load metadata for this sample :

sample_info = readRDS(paste0(out_dir, "/../1_metadata/takahashi_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
##   project_name  sample_type sample_identifier          color
## 1   GSM3717037 Takahashi_HD    Takahashi_HD_4 darkolivegreen

These is a parameter for different functions :

cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 0.5  # almost no filter
cut_nFeature_RNA = 20     # almost no filter
cut_percent.mt = 20
cut_percent.rb = 50

Load count matrix

In this section, we load the count matrix.

mat = read.table(count_matrix_file,
                 header = TRUE, row.names = 1)

# For the two 10X data, this is required
rownames(mat) = stringr::str_remove(rownames(mat),
                                    pattern = "hg19_")

# Seurat object
sobj = Seurat::CreateSeuratObject(counts = mat,
                                  project = sample_name,
                                  assay = "RNA")
rm(mat)
sobj
## An object of class Seurat 
## 32738 features across 6000 samples within 1 assay 
## Active assay: RNA (32738 features, 0 variable features)

(Time to run : 64.1 s)

We add the same columns as in metadata :

row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]

colnames(sobj@meta.data)
## [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
## [4] "project_name"      "sample_identifier" "sample_type"

Before filtering

Normalization

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 32738 features across 6000 samples within 1 assay 
## Active assay: RNA (32738 features, 3000 variable features)

Projection

We generate a tSNE to visualize cells before filtering.

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj
## An object of class Seurat 
## 32738 features across 6000 samples within 1 assay 
## Active assay: RNA (32738 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Cell type

We annotate cells for cell type using Seurat::AddModuleScore function.

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
## 
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               86               32               69               54 
##          B cells          cuticle           cortex          medulla 
##               65              233              126              102 
##              IRS    proliferative              IBL              ORS 
##              190              240              756              744 
##              IFE             HFSC      melanocytes        sebocytes 
##             1561             1239              375              128

(Time to run : 31.63 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle phase

We annotate cells for cell cycle phase using Seurat and cyclone.

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##   G1  G2M    S 
## 4991  478  443
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##         G1  G2M    S
##   G1  3337  278  285
##   G2M  761  160   83
##   S    893   40   75

(Time to run : 467.45 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")
sobj$log_nCount_RNA = log(sobj$nCount_RNA)

head(sobj@meta.data)
##                  orig.ident nCount_RNA nFeature_RNA project_name
## AAACCTGAGAAACCGC GSM3717037       1680          821   GSM3717037
## AAACCTGAGATATACG GSM3717037        139          112   GSM3717037
## AAACCTGCAAAGTCAA GSM3717037       9496         2666   GSM3717037
## AAACCTGCACGGCCAT GSM3717037       9837         2388   GSM3717037
## AAACCTGCAGCTGCAC GSM3717037      15679         3277   GSM3717037
## AAACCTGTCAACGGCC GSM3717037        154          136   GSM3717037
##                  sample_identifier  sample_type score_CD4_T_cells
## AAACCTGAGAAACCGC    Takahashi_HD_4 Takahashi_HD       -0.04562790
## AAACCTGAGATATACG    Takahashi_HD_4 Takahashi_HD        0.00000000
## AAACCTGCAAAGTCAA    Takahashi_HD_4 Takahashi_HD       -0.05167898
## AAACCTGCACGGCCAT    Takahashi_HD_4 Takahashi_HD       -0.05105475
## AAACCTGCAGCTGCAC    Takahashi_HD_4 Takahashi_HD        0.03635953
## AAACCTGTCAACGGCC    Takahashi_HD_4 Takahashi_HD        0.00000000
##                  score_CD8_T_cells score_Langerhans_cells score_macrophages
## AAACCTGAGAAACCGC      -0.023382583            -0.01671626      -0.021117268
## AAACCTGAGATATACG       0.000000000             0.00000000       0.000000000
## AAACCTGCAAAGTCAA      -0.033907911            -0.03393119      -0.022653710
## AAACCTGCACGGCCAT      -0.032718647            -0.03883237      -0.024568747
## AAACCTGCAGCTGCAC       0.005699966             0.02285598      -0.015237188
## AAACCTGTCAACGGCC       0.000000000            -0.01142884      -0.009465922
##                  score_B_cells score_cuticle score_cortex score_medulla
## AAACCTGAGAAACCGC   -0.02327732   -0.16888571  -0.10268153  -0.065967585
## AAACCTGAGATATACG    0.00000000    0.26509648   0.59596144  -0.004515443
## AAACCTGCAAAGTCAA   -0.02031960   -0.08818440  -0.13095755  -0.088541109
## AAACCTGCACGGCCAT   -0.01720601   -0.07301297  -0.12655003  -0.087404566
## AAACCTGCAGCTGCAC   -0.01566195   -0.10465005  -0.05183572  -0.084598266
## AAACCTGTCAACGGCC    0.00000000   -0.04221001  -0.03619432  -0.008818253
##                    score_IRS score_proliferative  score_IBL   score_ORS
## AAACCTGAGAAACCGC -0.17210597         -0.06791198  0.1189465  0.29244213
## AAACCTGAGATATACG -0.03225920         -0.01198232  0.1811686  0.18997492
## AAACCTGCAAAGTCAA -0.14062855         -0.08836367 -0.2514209  0.69912468
## AAACCTGCACGGCCAT -0.06395347         -0.04368462  0.8356216 -0.40732036
## AAACCTGCAGCTGCAC -0.19713934          0.03724028 -0.1012079 -0.04019209
## AAACCTGTCAACGGCC -0.05455243         -0.02047534  0.2062756 -0.15128391
##                    score_IFE  score_HFSC score_melanocytes score_sebocytes
## AAACCTGAGAAACCGC -0.20489837  0.27047211       -0.23033721      0.10534018
## AAACCTGAGATATACG -0.16856793  0.15495378       -0.06215034     -0.02152909
## AAACCTGCAAAGTCAA -0.14364016  0.33448522       -0.26051961     -0.12005067
## AAACCTGCACGGCCAT -0.34860765 -0.09168565       -0.05075498     -0.15163047
## AAACCTGCAGCTGCAC -0.06312317 -0.22288526       -0.32111717     -0.09937174
## AAACCTGTCAACGGCC -0.18136702 -0.09937170       -0.05416930     -0.01662678
##                  cell_type Seurat.Phase cyclone.Phase percent.mt percent.rb
## AAACCTGAGAAACCGC       ORS          G2M            G1  7.9166667   6.666667
## AAACCTGAGATATACG    cortex           G1            G1  0.7194245  35.251799
## AAACCTGCAAAGTCAA       ORS           G1             S  2.5484414  21.282645
## AAACCTGCACGGCCAT       IBL          G2M            G1  1.3418725  29.714344
## AAACCTGCAGCTGCAC       IFE          G2M            G1  1.7411825  29.351362
## AAACCTGTCAACGGCC       IBL           G1            G1  5.1948052  12.987013
##                  log_nCount_RNA
## AAACCTGAGAAACCGC       7.426549
## AAACCTGAGATATACG       4.934474
## AAACCTGCAAAGTCAA       9.158626
## AAACCTGCACGGCCAT       9.193906
## AAACCTGCAGCTGCAC       9.660078
## AAACCTGTCAACGGCC       5.036953

We get the cell barcodes for the failing cells :

fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()

Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))

fsobj
## An object of class Seurat 
## 32738 features across 6000 samples within 1 assay 
## Active assay: RNA (32738 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

fsobj = Seurat::NormalizeData(fsobj,
                              normalization.method = "LogNormalize",
                              assay = "RNA")

fsobj = Seurat::FindVariableFeatures(fsobj,
                                     assay = "RNA",
                                     nfeatures = 3000)
fsobj
## An object of class Seurat 
## 32738 features across 6000 samples within 1 assay 
## Active assay: RNA (32738 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

We identify doublet cells :

fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)
## [1] 32738  6000
## 
## FALSE  TRUE 
##  4942  1058 
## [12:12:22] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## 
## FALSE  TRUE 
##  4815  1185 
## 
## FALSE  TRUE 
##  4140  1860
fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)

(Time to run : 124.09 s)

We add the information in the non filtered Seurat object :

sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)

Quality control representation

We can visualize the 4 cells quality with a Venn diagram :

n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))

Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)

Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Number of features

To visualize the threshold for number of features, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)

Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)

Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)

Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the log_nCount_RNA by nFeature_RNA figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (doublets_consensus.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (scDblFinder.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (hybrid_score.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

Visualization as piechart

Do filtered cells belong to a particular cell type ?

sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")

Doublet cells status

We can compare doublet detection methods with a Venn diagram :

ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))

We visualize cells annotation for doublets :

plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)

What is the composition of doublet cells ? We just look at score for each cell type.

sobj$orig.ident.doublets = case_when(is.na(sobj$doublets_consensus.class) ~ "bad quality",
                                     sobj$doublets_consensus.class == TRUE ~ paste0(sobj$orig.ident, " doublets"),
                                     sobj$doublets_consensus.class == FALSE ~ "not doublet")
sobj$orig.ident.doublets = factor(sobj$orig.ident.doublets,
                                  levels = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                             "bad quality", "not doublet"))

doublets_compo = function(score1, score2) {
  type1 = unlist(lapply(stringr::str_split(score1, pattern = "score_"), `[[`, 2))
  type2 = unlist(lapply(stringr::str_split(score2, pattern = "score_"), `[[`, 2))
  
  if (type1 == type2) {
    the_title = "Homotypic doublet"
    the_subtitle = type1
    score1 = "log_nCount_RNA"
  } else {
    the_title = "Heterotypic doublet"
    the_subtitle = paste(type1, type2, sep = " + ")
  }
  
  p = sobj@meta.data %>%
    dplyr::arrange(desc(orig.ident.doublets)) %>%
    ggplot2::ggplot(., aes(x = eval(parse(text = score1)),
                           y = eval(parse(text = score2)),
                           col = orig.ident.doublets)) +
    ggplot2::geom_point(size = 0.25) +
    ggplot2::scale_color_manual(values = c(sample_info$color, "gray90", "gray60"),
                                breaks = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                           "bad quality", "not doublet")) +
    ggplot2::labs(x = score1, y = score2,
                  title = the_title, subtitle = the_subtitle) +
    ggplot2::theme_classic() +
    ggplot2::theme(aspect.ratio = 1,
                   plot.title = element_text(hjust = 0.5),
                   plot.subtitle = element_text(hjust = 0.5))
  
  return(p)
}
score_columns = grep(x = colnames(sobj@meta.data),
                     pattern = "^score",
                     value = TRUE)
combinations = expand.grid(score_columns, score_columns) %>%
  apply(., 1, sort) %>% t() %>%
  as.data.frame()
combinations = combinations[!duplicated(combinations), ]

plot_list = apply(combinations, 1, FUN = function(elem) {
  doublets_compo(elem[1], elem[2])
})

sobj$orig.ident.doublets = NULL
patchwork::wrap_plots(plot_list, ncol = 4) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
show

Save

We could save this object before filtering (remove eval = FALSE) :

saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))

Filtering

We remove :

  • cells with a number of UMI lower than 0.5
  • cells expressing a number of genes lower than 20
  • cells having more than 20 % of UMI related to mitochondrial genes
  • cells having more than 50 % of UMI related to ribosomal genes

Note: We do not filter cells detected as doublets. Indeed, few genes and transcripts are detected per cell, and the best cells are therefore annotated as doublets.

sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb)))
sobj
## An object of class Seurat 
## 32738 features across 5768 samples within 1 assay 
## Active assay: RNA (32738 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Post-filtering processing

Normalization

We normalize the count matrix for remaining cells :

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 32738 features across 5768 samples within 1 assay 
## Active assay: RNA (32738 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Projection

We perform a PCA :

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE and a UMAP with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))

Annotation

We annotate cells for cell type, with the new normalized expression matrix :

score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
## 
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               80               38               57               53 
##          B cells          cuticle           cortex          medulla 
##               62              209              118               93 
##              IRS    proliferative              IBL              ORS 
##              186              230              734              704 
##              IFE             HFSC      melanocytes        sebocytes 
##             1503             1215              370              116

(Time to run : 25.13 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle

We annotate cells for cell cycle phase :

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##   G1  G2M    S 
## 4905  413  368
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##         G1  G2M    S
##   G1  3260  246  249
##   G2M  753  135   59
##   S    892   32   60

(Time to run : 444.22 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Clustering

We make a highly resolutive clustering :

sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5768
## Number of edges: 196760
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7884
## Number of communities: 28
## Elapsed time: 0 seconds
table(sobj$seurat_clusters)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 755 660 484 391 356 274 266 258 229 200 178 169 159 155 155 142 126 125 117  98 
##  20  21  22  23  24  25  26  27 
##  89  77  67  59  58  56  48  17

Visualization

Cell type

We can visualize the cell type :

tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Cell cycle

We can visualize the cell cycle, from Seurat :

tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

We can visualize the cell cycle, from cyclone :

tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Clusters

We visualize the clustering :

tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Gene expression

We visualize all cell types markers on the tSNE :

markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)

Save

We save the annotated and filtered Seurat object :

saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))

R session

show
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5   patchwork_1.1.2 dplyr_1.0.7    
## 
## loaded via a namespace (and not attached):
##   [1] softImpute_1.4              graphlayouts_0.7.0         
##   [3] pbapply_1.4-2               lattice_0.20-41            
##   [5] haven_2.3.1                 vctrs_0.3.8                
##   [7] usethis_2.0.1               dynwrap_1.2.1              
##   [9] blob_1.2.1                  survival_3.2-13            
##  [11] prodlim_2019.11.13          dynutils_1.0.5             
##  [13] later_1.3.0                 DBI_1.1.1                  
##  [15] R.utils_2.11.0              SingleCellExperiment_1.8.0 
##  [17] rappdirs_0.3.3              uwot_0.1.8                 
##  [19] dqrng_0.2.1                 jpeg_0.1-8.1               
##  [21] zlibbioc_1.32.0             pspline_1.0-18             
##  [23] pcaMethods_1.78.0           mvtnorm_1.1-1              
##  [25] htmlwidgets_1.5.4           GlobalOptions_0.1.2        
##  [27] future_1.22.1               UpSetR_1.4.0               
##  [29] laeken_0.5.2                leiden_0.3.3               
##  [31] clustree_0.4.3              parallel_3.6.3             
##  [33] scater_1.14.6               irlba_2.3.3                
##  [35] DEoptimR_1.0-9              tidygraph_1.1.2            
##  [37] Rcpp_1.0.9                  readr_2.0.2                
##  [39] KernSmooth_2.23-17          carrier_0.1.0              
##  [41] promises_1.1.0              gdata_2.18.0               
##  [43] DelayedArray_0.12.3         limma_3.42.2               
##  [45] graph_1.64.0                RcppParallel_5.1.4         
##  [47] Hmisc_4.4-0                 fs_1.5.2                   
##  [49] RSpectra_0.16-0             fastmatch_1.1-0            
##  [51] ranger_0.12.1               digest_0.6.25              
##  [53] png_0.1-7                   sctransform_0.2.1          
##  [55] cowplot_1.0.0               DOSE_3.12.0                
##  [57] ggvenn_0.1.9                here_1.0.1                 
##  [59] TInGa_0.0.0.9000            ggraph_2.0.3               
##  [61] pkgconfig_2.0.3             GO.db_3.10.0               
##  [63] DelayedMatrixStats_1.8.0    gower_0.2.1                
##  [65] ggbeeswarm_0.6.0            iterators_1.0.12           
##  [67] DropletUtils_1.6.1          reticulate_1.26            
##  [69] clusterProfiler_3.14.3      SummarizedExperiment_1.16.1
##  [71] circlize_0.4.15             beeswarm_0.4.0             
##  [73] GetoptLong_1.0.5            xfun_0.35                  
##  [75] bslib_0.3.1                 zoo_1.8-10                 
##  [77] tidyselect_1.1.0            reshape2_1.4.4             
##  [79] purrr_0.3.4                 ica_1.0-2                  
##  [81] pcaPP_1.9-73                viridisLite_0.3.0          
##  [83] rtracklayer_1.46.0          rlang_1.0.2                
##  [85] hexbin_1.28.1               jquerylib_0.1.4            
##  [87] dyneval_0.9.9               glue_1.4.2                 
##  [89] RColorBrewer_1.1-2          matrixStats_0.56.0         
##  [91] stringr_1.4.0               lava_1.6.7                 
##  [93] europepmc_0.3               DESeq2_1.26.0              
##  [95] recipes_0.1.17              labeling_0.3               
##  [97] httpuv_1.5.2                class_7.3-17               
##  [99] BiocNeighbors_1.4.2         DO.db_2.9                  
## [101] annotate_1.64.0             jsonlite_1.7.2             
## [103] XVector_0.26.0              bit_4.0.4                  
## [105] mime_0.9                    aquarius_0.1.5             
## [107] Rsamtools_2.2.3             gridExtra_2.3              
## [109] gplots_3.0.3                stringi_1.4.6              
## [111] processx_3.5.2              gsl_2.1-6                  
## [113] bitops_1.0-6                cli_3.0.1                  
## [115] batchelor_1.2.4             RSQLite_2.2.0              
## [117] randomForest_4.6-14         tidyr_1.1.4                
## [119] data.table_1.14.2           rstudioapi_0.13            
## [121] org.Mm.eg.db_3.10.0         GenomicAlignments_1.22.1   
## [123] nlme_3.1-147                qvalue_2.18.0              
## [125] scran_1.14.6                locfit_1.5-9.4             
## [127] scDblFinder_1.1.8           listenv_0.8.0              
## [129] ggthemes_4.2.4              gridGraphics_0.5-0         
## [131] R.oo_1.24.0                 dbplyr_1.4.4               
## [133] BiocGenerics_0.32.0         TTR_0.24.2                 
## [135] readxl_1.3.1                lifecycle_1.0.1            
## [137] timeDate_3043.102           ggpattern_0.3.1            
## [139] munsell_0.5.0               cellranger_1.1.0           
## [141] R.methodsS3_1.8.1           proxyC_0.1.5               
## [143] visNetwork_2.0.9            caTools_1.18.0             
## [145] codetools_0.2-16            Biobase_2.46.0             
## [147] GenomeInfoDb_1.22.1         vipor_0.4.5                
## [149] lmtest_0.9-38               msigdbr_7.5.1              
## [151] htmlTable_1.13.3            triebeard_0.3.0            
## [153] lsei_1.2-0                  xtable_1.8-4               
## [155] ROCR_1.0-7                  BiocManager_1.30.10        
## [157] scatterplot3d_0.3-41        abind_1.4-5                
## [159] farver_2.0.3                parallelly_1.28.1          
## [161] RANN_2.6.1                  askpass_1.1                
## [163] GenomicRanges_1.38.0        RcppAnnoy_0.0.16           
## [165] tibble_3.1.5                ggdendro_0.1-20            
## [167] cluster_2.1.0               future.apply_1.5.0         
## [169] Seurat_3.1.5                dendextend_1.15.1          
## [171] Matrix_1.3-2                ellipsis_0.3.2             
## [173] prettyunits_1.1.1           lubridate_1.7.9            
## [175] ggridges_0.5.2              igraph_1.2.5               
## [177] RcppEigen_0.3.3.7.0         fgsea_1.12.0               
## [179] remotes_2.4.2               scBFA_1.0.0                
## [181] destiny_3.0.1               VIM_6.1.1                  
## [183] testthat_3.1.0              htmltools_0.5.2            
## [185] BiocFileCache_1.10.2        yaml_2.2.1                 
## [187] utf8_1.1.4                  plotly_4.9.2.1             
## [189] XML_3.99-0.3                ModelMetrics_1.2.2.2       
## [191] e1071_1.7-3                 foreign_0.8-76             
## [193] withr_2.5.0                 fitdistrplus_1.0-14        
## [195] BiocParallel_1.20.1         xgboost_1.4.1.1            
## [197] bit64_4.0.5                 foreach_1.5.0              
## [199] robustbase_0.93-9           Biostrings_2.54.0          
## [201] GOSemSim_2.13.1             rsvd_1.0.3                 
## [203] memoise_2.0.0               evaluate_0.18              
## [205] forcats_0.5.0               rio_0.5.16                 
## [207] geneplotter_1.64.0          tzdb_0.1.2                 
## [209] caret_6.0-86                ps_1.6.0                   
## [211] DiagrammeR_1.0.6.1          curl_4.3                   
## [213] fdrtool_1.2.15              fansi_0.4.1                
## [215] highr_0.8                   urltools_1.7.3             
## [217] xts_0.12.1                  GSEABase_1.48.0            
## [219] acepack_1.4.1               edgeR_3.28.1               
## [221] checkmate_2.0.0             scds_1.2.0                 
## [223] cachem_1.0.6                npsurv_0.4-0               
## [225] babelgene_22.3              rjson_0.2.20               
## [227] openxlsx_4.1.5              ggrepel_0.9.1              
## [229] clue_0.3-60                 rprojroot_2.0.2            
## [231] stabledist_0.7-1            tools_3.6.3                
## [233] sass_0.4.0                  nichenetr_1.1.1            
## [235] magrittr_2.0.1              RCurl_1.98-1.2             
## [237] proxy_0.4-24                car_3.0-11                 
## [239] ape_5.3                     ggplotify_0.0.5            
## [241] xml2_1.3.2                  httr_1.4.2                 
## [243] assertthat_0.2.1            rmarkdown_2.18             
## [245] boot_1.3-25                 globals_0.14.0             
## [247] R6_2.4.1                    Rhdf5lib_1.8.0             
## [249] nnet_7.3-14                 RcppHNSW_0.2.0             
## [251] progress_1.2.2              genefilter_1.68.0          
## [253] statmod_1.4.34              gtools_3.8.2               
## [255] shape_1.4.6                 HDF5Array_1.14.4           
## [257] BiocSingular_1.2.2          rhdf5_2.30.1               
## [259] splines_3.6.3               AUCell_1.8.0               
## [261] carData_3.0-4               colorspace_1.4-1           
## [263] generics_0.1.0              stats4_3.6.3               
## [265] base64enc_0.1-3             dynfeature_1.0.0           
## [267] smoother_1.1                gridtext_0.1.1             
## [269] pillar_1.6.3                tweenr_1.0.1               
## [271] sp_1.4-1                    ggplot.multistats_1.0.0    
## [273] rvcheck_0.1.8               GenomeInfoDbData_1.2.2     
## [275] plyr_1.8.6                  gtable_0.3.0               
## [277] zip_2.2.0                   knitr_1.41                 
## [279] ComplexHeatmap_2.14.0       latticeExtra_0.6-29        
## [281] biomaRt_2.42.1              IRanges_2.20.2             
## [283] fastmap_1.1.0               ADGofTest_0.3              
## [285] copula_1.0-0                doParallel_1.0.15          
## [287] AnnotationDbi_1.48.0        vcd_1.4-8                  
## [289] babelwhale_1.0.1            openssl_1.4.1              
## [291] scales_1.1.1                backports_1.2.1            
## [293] S4Vectors_0.24.4            ipred_0.9-12               
## [295] enrichplot_1.6.1            hms_1.1.1                  
## [297] ggforce_0.3.1               Rtsne_0.15                 
## [299] shiny_1.7.1                 numDeriv_2016.8-1.1        
## [301] polyclip_1.10-0             grid_3.6.3                 
## [303] lazyeval_0.2.2              Formula_1.2-3              
## [305] tsne_0.1-3                  crayon_1.3.4               
## [307] MASS_7.3-54                 pROC_1.16.2                
## [309] viridis_0.5.1               dynparam_1.0.0             
## [311] rpart_4.1-15                zinbwave_1.8.0             
## [313] compiler_3.6.3              ggtext_0.1.0
---
params:
  sample_name: "GSM3717034"
title: "GSE129611 dataset"
subtitle: "Sample `r params$sample_name`"
author: "Audrey"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output:
  html_document:
    code_folding: show
    code_download: true
    toc: true
    toc_float: true
    number_sections: false
---

<style>
body {
text-align: justify}
</style>

<!-- Automatically computes and prints in the output the running time for any code chunk -->
```{r, echo=FALSE}
# https://github.com/rstudio/rmarkdown/issues/1453
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)
    
    if (isFALSE(options[[paste0("fold_", type)]])) return(res)
    
    paste0(
      "<details><summary>", "show", "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
knitr::knit_hooks$set(
  output = hook_foldable("output"),
  plot = hook_foldable("plot"),
  time_it = local({
    now = NULL
    function(before, options) {
      if (options$time_it) {
        if (before) {
          now <<- Sys.time()
        } else {
          res = difftime(Sys.time(), now, units = "secs")
          paste("(Time to run :", round(res, digits = 2), "s)")
        }
      }
    }
  })
)
```

<!-- Set default parameters for all chunks -->
```{r, setup, include = FALSE}
set.seed(1337L)
knitr::opts_chunk$set(echo = TRUE, # display code
                      # display chunk output
                      message = FALSE,
                      warning = FALSE,
                      fold_output = FALSE, # usefull for sessionInfo()
                      fold_plot = FALSE,
                      
                      # figure settings
                      fig.align = 'center',
                      fig.width = 20,
                      fig.height = 15,
                      
                      # something about seed, chunk and Rmarkdown compilation
                      # https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
                      # cache = TRUE,
                      cache.lazy = FALSE, 
                      
                      # add runtime after chunk
                      time_it = FALSE)
```


The goal of this script is to generate a Seurat object for sample `r params$sample_name`.

* removal of cells based on quality control metrics
* normalization with `LogNormalize`, then doublets detection using `scran hybrid` and `scDblFinder` method (but not filtering)
* normalization with `LogNormalize`, for only the remaining cells
* cell cycle and cell type annotation
* dimensionality reduction using `PCA`
* projection using `tSNE` and `UMAP`


```{r library}
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
```

# Preparation

In this section, we set the global settings of the analysis. We will store data there :

```{r out_dir}
out_dir = "."
```

We load the parameters :

```{r get_param}
sample_name = params$sample_name # "GSM3717034"
# sample_name = "GSM3717034"
```

Input count matrix is there :

```{r count_matrix_dir}
count_matrix_dir = paste0(out_dir, "/input/", sample_name, "/")
count_matrix_file = list.files(count_matrix_dir, full.names = TRUE)
count_matrix_file
```


We load the markers and specific colors for each cell type :

```{r cell_markers}
cell_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
```

Here are custom colors for each cell type :

```{r color_markers, fig.width = 12, fig.height = 1, class.source = "fold-hide"}
color_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())
```


We load markers to display on the dotplot :

```{r dotplot_markers}
dotplot_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
```


We load metadata for this sample :

```{r sample_info}
sample_info = readRDS(paste0(out_dir, "/../1_metadata/takahashi_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
```


These is a parameter for different functions :

```{r global_settings}
cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 0.5  # almost no filter
cut_nFeature_RNA = 20     # almost no filter
cut_percent.mt = 20
cut_percent.rb = 50
```

# Load count matrix

In this section, we load the count matrix.

```{r load_count_matrix, time_it = TRUE}
mat = read.table(count_matrix_file,
                 header = TRUE, row.names = 1)

# For the two 10X data, this is required
rownames(mat) = stringr::str_remove(rownames(mat),
                                    pattern = "hg19_")

# Seurat object
sobj = Seurat::CreateSeuratObject(counts = mat,
                                  project = sample_name,
                                  assay = "RNA")
rm(mat)
sobj
```


We add the same columns as in metadata :

```{r add_metadata}
row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]

colnames(sobj@meta.data)
```

# Before filtering

## Normalization

```{r normalization}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We generate a tSNE to visualize cells before filtering.

```{r pca_before, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE with 20 principal components :

```{r tsne_before}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj
```

## Cell type

We annotate cells for cell type using `Seurat::AddModuleScore` function.

```{r cell_annot_custom_short, time_it = TRUE}
sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
```

To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short, fig.width = 12, fig.height = 9}
markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```

## Cell cycle phase

We annotate cells for cell cycle phase using `Seurat` and `cyclone`.

```{r cell_cycle, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```

We visualize cell cycle on the projection :

```{r see_cell_cycle1, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```

# Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

```{r qc_metrics}
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")
sobj$log_nCount_RNA = log(sobj$nCount_RNA)

head(sobj@meta.data)
```

We get the cell barcodes for the failing cells :

```{r failed}
fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()
```


## Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

```{r fsobj}
fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))

fsobj
```

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

```{r normalization_doublets}
fsobj = Seurat::NormalizeData(fsobj,
                              normalization.method = "LogNormalize",
                              assay = "RNA")

fsobj = Seurat::FindVariableFeatures(fsobj,
                                     assay = "RNA",
                                     nfeatures = 3000)
fsobj
```

We identify doublet cells :

```{r doublet_detection, time_it = TRUE}
fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)

fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)
```

We add the information in the non filtered Seurat object :

```{r add_doublet_metrics}
sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)
```

```{r clean_fsobj, echo = FALSE}
rm(fsobj)
```


## Quality control representation

We can visualize the 4 cells quality with a Venn diagram : 

```{r qc_venn, class.source = "fold-hide", fig.width = 8, fig.height = 6}
n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))
```


### Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

```{r qc_umi_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)
```

```{r vln_umi_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_umi_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Number of features

To visualize the threshold for number of features, we can make a histogram :

```{r qc_features_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)
```


```{r vln_features_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_features_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

```{r qc_mito_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)
```

```{r vln_percentmt_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_mito_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

```{r qc_ribo_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)
```


```{r vln_percentrb_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_ribo_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the `log_nCount_RNA` by `nFeature_RNA` figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

```{r qc_patchwork_cell_type, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

```{r qc_patchwork_mito, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

```{r qc_patchwork_ribo, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`doublets_consensus.class`) :

```{r qc_patchwork_doublet_consensus, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`scDblFinder.class`) :

```{r qc_patchwork_scdblfinder, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`hybrid_score.class`) :

```{r qc_patchwork_hybrid_score, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```


### Visualization as piechart

Do filtered cells belong to a particular cell type ?

```{r qc_piechart_cell_type, class.source = "fold-hide", fig.width = 12, fig.height = 9}
sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

```{r clean_qc_4, echo = FALSE}
sobj$all_cells = NULL

rm(plot_list, df)
```


### Doublet cells status

We can compare doublet detection methods with a Venn diagram : 

```{r qc_venn_doublet, class.source = "fold-hide", fig.width = 8, fig.height = 6}
ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
```

We visualize cells annotation for doublets :

```{r doublets_proj, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)
```

What is the composition of doublet cells ? We just look at score for each cell type.

```{r doublets_composition, class.source = "fold-hide"}
sobj$orig.ident.doublets = case_when(is.na(sobj$doublets_consensus.class) ~ "bad quality",
                                     sobj$doublets_consensus.class == TRUE ~ paste0(sobj$orig.ident, " doublets"),
                                     sobj$doublets_consensus.class == FALSE ~ "not doublet")
sobj$orig.ident.doublets = factor(sobj$orig.ident.doublets,
                                  levels = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                             "bad quality", "not doublet"))

doublets_compo = function(score1, score2) {
  type1 = unlist(lapply(stringr::str_split(score1, pattern = "score_"), `[[`, 2))
  type2 = unlist(lapply(stringr::str_split(score2, pattern = "score_"), `[[`, 2))
  
  if (type1 == type2) {
    the_title = "Homotypic doublet"
    the_subtitle = type1
    score1 = "log_nCount_RNA"
  } else {
    the_title = "Heterotypic doublet"
    the_subtitle = paste(type1, type2, sep = " + ")
  }
  
  p = sobj@meta.data %>%
    dplyr::arrange(desc(orig.ident.doublets)) %>%
    ggplot2::ggplot(., aes(x = eval(parse(text = score1)),
                           y = eval(parse(text = score2)),
                           col = orig.ident.doublets)) +
    ggplot2::geom_point(size = 0.25) +
    ggplot2::scale_color_manual(values = c(sample_info$color, "gray90", "gray60"),
                                breaks = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                           "bad quality", "not doublet")) +
    ggplot2::labs(x = score1, y = score2,
                  title = the_title, subtitle = the_subtitle) +
    ggplot2::theme_classic() +
    ggplot2::theme(aspect.ratio = 1,
                   plot.title = element_text(hjust = 0.5),
                   plot.subtitle = element_text(hjust = 0.5))
  
  return(p)
}
```

```{r doublets_patchwork_prep, class.source = "fold-hide"}
score_columns = grep(x = colnames(sobj@meta.data),
                     pattern = "^score",
                     value = TRUE)
combinations = expand.grid(score_columns, score_columns) %>%
  apply(., 1, sort) %>% t() %>%
  as.data.frame()
combinations = combinations[!duplicated(combinations), ]

plot_list = apply(combinations, 1, FUN = function(elem) {
  doublets_compo(elem[1], elem[2])
})

sobj$orig.ident.doublets = NULL
```


```{r doublets_patchwork, fig.width = 12, fig.height = 80, fold_plot = TRUE}
patchwork::wrap_plots(plot_list, ncol = 4) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

## Save

We could save this object before filtering (remove `eval = FALSE`) :

```{r save_sobj_unfiltered_annotated, eval = FALSE}
saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))
```


# Filtering

We remove :

* cells with a number of UMI lower than `r cut_log_nCount_RNA`
* cells expressing a number of genes lower than `r cut_nFeature_RNA`
* cells having more than `r cut_percent.mt` \% of UMI related to mitochondrial genes
* cells having more than `r cut_percent.rb` \% of UMI related to ribosomal genes

**Note**: We do not filter cells detected as doublets. Indeed, few genes and transcripts are detected per cell, and the best cells are therefore annotated as doublets.

```{r filter_cells}
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb)))
sobj
```

```{r clean_filter, echo = FALSE}
rm(fail_doublets_consensus, fail_doublets_scDblFinder, fail_doublets_hybrid,
   fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA,
   cut_percent.mt, cut_percent.rb, cut_log_nCount_RNA, cut_nFeature_RNA)
```


# Post-filtering processing

## Normalization

We normalize the count matrix for remaining cells :

```{r normalization_3}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We perform a PCA :

```{r pca, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE and a UMAP with 20 principal components :

```{r tsne_umap}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))
```


## Annotation

We annotate cells for cell type, with the new normalized expression matrix :

```{r cell_type_2, time_it = TRUE}
score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
```


To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short2, fig.width = 12, fig.height = 9}
markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype2, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```


### Cell cycle

We annotate cells for cell cycle phase :

```{r cell_cycle2, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```


We visualize cell cycle on the projection :

```{r see_cell_cycle2, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```


## Clustering

We make a highly resolutive clustering :

```{r clustering}
sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)

table(sobj$seurat_clusters)
```


# Visualization

## Cell type

We can visualize the cell type :

```{r see_cell_type, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```

## Cell cycle

We can visualize the cell cycle, from Seurat :

```{r see_cc_Seurat, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


We can visualize the cell cycle, from cyclone :

```{r see_cc_cyclone, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Clusters

We visualize the clustering :

```{r see_clusters, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Gene expression

We visualize all cell types markers on the tSNE :

```{r plot_genes, fig.width = 12, fig.height = 20}
markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)
```


# Save

We save the annotated and filtered Seurat object :

```{r save_sobj_filtered_annotated}
saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))
```


# R session

```{r sessioninfo, echo = FALSE, fold_output = TRUE}
sessionInfo()
```
